import os
import gzip
from collections import defaultdict
import pybedtools
from numpy import *


assembly = "hg38"

# Read the chromosome sizes of the assembly into a dictionary mapping the
# chromosome name to its size. Each line of the file is expected to consist
# of exactly two whitespace-separated fields: the name and the integer size.
directory = "/osc-fs_home/scratch/mdehoon/Data/Genomes"
filename = "%s.chrom.sizes" % assembly
path = os.path.join(directory, assembly, filename)
sizes = {}
# The context manager closes the file even if a malformed line raises.
with open(path) as handle:
    for line in handle:
        chromosome, size = line.split()
        sizes[chromosome] = int(size)

# Read the promoters from the gzipped BED file and group them by chromosome:
# loci maps each chromosome name to the list of its promoter intervals, in
# file order.
directory = "/osc-fs_home/mdehoon/Data/Fantom6/FANTOMCAT"
filename = "F6_CAT.promoter.bed.gz"
path = os.path.join(directory, filename)
loci = defaultdict(list)
# The context manager closes the file even if parsing an interval raises.
with gzip.open(path, "rt") as handle:
    promoters = pybedtools.BedTool(handle)
    for promoter in promoters:
        chromosome = promoter.chrom
        loci[chromosome].append(promoter)

def find_expression(data, promoter):
    """Collapse a promoter onto its position of highest expression.

    The expression profile of the promoter is the slice
    data[promoter.strand][promoter.start:promoter.end]. All positions whose
    value lies within 1.e-6 of the maximum of the profile are treated as
    tied; the median of the tied positions (rounded to an integer) is
    selected. The promoter is modified in place:

     - start becomes the original start plus the selected offset;
     - end becomes the new start plus one;
     - score becomes the sum of the profile over the original window,
       converted to a string.

    Arguments:
     - data: mapping from strand to a one-dimensional sequence of values
       indexed by position (the caller in this file passes numpy arrays);
     - promoter: object with start, end, strand, and score attributes.

    Raises a ValueError if the window selects no positions, as no maximum
    exists in that case.
    """
    start = promoter.start
    end = promoter.end
    strand = promoter.strand
    expression = asarray(data[strand][start:end])
    if expression.size == 0:
        raise ValueError("empty expression window for promoter at %d-%d (%s)"
                         % (start, end, strand))
    # Subtract a small tolerance so that values that are equal to the
    # maximum up to roundoff are counted as ties.
    threshold = expression.max() - 1.e-6
    candidates = flatnonzero(expression >= threshold)
    position = int(round(median(candidates)))
    promoter.start += position
    promoter.end = promoter.start + 1
    promoter.score = str(expression.sum())

# Accumulate the CAGE counts by position and strand for one chromosome at a
# time. Once all lines of a chromosome have been read, find_expression is
# applied to each promoter on that chromosome, and the promoter is added to
# the list of promoters to be written. Promoters on chromosomes that do not
# occur in the CAGE file are not included.
filename = "CAGE.ctss.bed"
current = None
seen = set()
promoters = []
# The context manager closes the file even if processing a line raises.
with open(filename) as handle:
    lines = pybedtools.BedTool(handle)
    for line in lines:
        if line.chrom != current:
            if current is not None:
                for promoter in loci[current]:
                    find_expression(data, promoter)
                    promoters.append(promoter)
            current = line.chrom
            # find_expression modifies the promoters in place, so a
            # chromosome that appears in more than one run of lines would
            # have its promoters shifted and stored twice. Fail instead.
            if current in seen:
                raise ValueError("%s: lines for chromosome %s are not contiguous"
                                 % (filename, current))
            seen.add(current)
            size = sizes[current]
            data = {'+': zeros(size), '-': zeros(size)}
        strand = line.strand
        position = line.start
        count = float(line.score)
        data[strand][position] += count
# Process the promoters on the last chromosome in the file.
for promoter in loci[current]:
    find_expression(data, promoter)
    promoters.append(promoter)

# Write the promoters, in the order in which they were collected; each
# record is the string representation of the promoter.
filename = "promoters.FANTOM_CAT.THP-1.bed"
print("Writing", filename)
# The context manager flushes and closes the file even if a write raises.
with open(filename, 'w') as output:
    for promoter in promoters:
        output.write(str(promoter))
